library(stm)        # Package for sturctural topic modeling
library(igraph)     # Package for network analysis and visualisation
library(stmCorrViz) # Package for hierarchical correlation view of STMs
library(dplyr)
library(ggplot2)
library(tm)
library(broom)
library(tidyr)
library(tidytext)
library(wordcloud)
library(anytime)
library(tidystm)
library(RColorBrewer)
library(scales)

reorder_within <- function(x, by, within, fun = mean, sep = "___", ...) {
  new_x <- paste(x, within, sep = sep)
  stats::reorder(new_x, by, FUN = fun)
}


#' @rdname reorder_within
#' @export
scale_x_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_x_discrete(labels = function(x) gsub(reg, "", x), ...)
}


#' @rdname reorder_within
#' @export
scale_y_reordered <- function(..., sep = "___") {
  reg <- paste0(sep, ".+$")
  ggplot2::scale_y_discrete(labels = function(x) gsub(reg, "", x), ...)
}

# Read in the data
raw_data <-  read.csv("mysql_results/gun_org_data_from_2017_01_01_to_2018_04_01.csv", sep="\t", encoding = 'UTF-8')

# Clean up the text data for topic modeling
limited_df <-  select(raw_data, tweet_id, name, tweet_date, tweet_text, tweet_epochtime, gun_control)
cleaned_df <-  limited_df %>%
  mutate(clean_text = gsub("https[^ ]*", " ", tweet_text)) %>%
  mutate(clean_text = gsub("[^A-Za-z0-9 #]", "", clean_text))

unique_df <- unique(cleaned_df)

unique_tweets = unique(unique_df$clean_text)
clean_df <- unique_df[!duplicated(unique_df$clean_text), ]
clean_df <- mutate(clean_df, simple_date = anydate(tweet_epochtime))

parkland_df <- clean_df %>%
  filter(tweet_epochtime >= 1517472000 & tweet_epochtime <= 1522566000)

############ Building Corpus for topic model

#Prepare
processed <- textProcessor(parkland_df$clean_text, metadata=parkland_df, customstopwords = c('amp'))
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)
docs <- out$documents
vocab <- out$vocab
meta <- out$meta

plotRemoved(processed$documents, lower.thresh=seq(1,200, by=100))

############### Estimate single model ######################
poliblogPrevFit <- stm(out$documents, out$vocab, K=31, 
                       prevalence=~name+gun_control,
                       data=out$meta, init.type="Spectral", seed=8458159)

plot(poliblogPrevFit, type="summary", xlim=c(0,.4))
plot(poliblogPrevFit, type="labels", topics=c(3,7,20))
plot(poliblogPrevFit, type="hist")
plot(poliblogPrevFit, type="perspectives", topics=c(7,10))

############## Running multiple models with different numbers of topics #############
storage <- manyTopics(out$documents, out$vocab, K=c(5:25), 
                      prevalence=~name+gun_control,
                      data=meta, runs=10, seed=8458159)
storageOutput1 <- storage$out[[1]] # For example, choosing the model with 7 topics

############## Data driven appoach to multiple topics
kResult <- searchK(out$documents, out$vocab, K=c(5:25), 
                   prevalence=~name+gun_control, 
                   max.em.its=50, data=meta)
kResult2 <- searchK(out$documents, out$vocab, K=c(25:50), 
                    max.em.its=50, data=meta)
plot(kResult)
plot(kResult2)

num_topics <- 32# Select best number from metrics above

############## Running multiple models for semantic coherence  ##################
poliblogSelect <- selectModel(out$documents, out$vocab, K=num_topics, 
                              prevalence=~name+gun_control, data=meta, runs=20, seed=8458159)

poliblogSelectContent <- selectModel(out$documents, out$vocab, K=num_topics, 
                              content = ~gun_control, 
                              prevalence=~name+gun_control, 
                              max.em.its=100, data=meta, runs=20, seed=8458159)

plotModels(poliblogSelect)


selectedModel <- poliblogSelect$runout[[4]] # Choose best model from selection function


###################### Work with chosen stored model#########################
chosen_model = selectedModel


# Text output for labeling
numwordsout = 10
results <-  labelTopics(chosen_model, n = numwordsout)

probs <- results$prob
frex <- results$frex
score <- results$score

colnames(probs) <- paste("prob", 1:numwordsout, sep = "")
colnames(frex) <- paste("frex", 1:numwordsout, sep = "")
colnames(score) <- paste("score", 1:numwordsout, sep = "")

out_df = cbind(frex, probs, score)
#write.csv(out_df, "E:/digiadv/data/gun_images/validation/topicmodel_storage_validation.csv")
#write.csv(out_df, "C:/Users/weszu/Documents/GitHub/twitter_scrape/data/topic_model_results/topicmodel_storage_validation.csv")
write.csv(out_df, "data/rebuilt_topic_model.csv")



# Graphing aspects of model
plot(chosen_model)

td_beta <- tidy(chosen_model)

td_beta %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  mutate(topic = paste0("Topic ", topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  labs(x = NULL, y = expression(beta),
       title = "Highest word probabilities for each topic",
       subtitle = "Different words are associated with different topics")


td_gamma <- tidy(chosen_model, matrix = "gamma",                    
                 document_names = rownames(clean_df))

ggplot(td_gamma, aes(gamma, fill = as.factor(topic))) +
  geom_histogram(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~ topic, ncol = 3) +
  labs(title = "Distribution of document probabilities for each topic",
       subtitle = "Each topic is associated strongly with only a small number of tweets",
       y = "Number of tweets", x = expression(gamma))

# Github exploration
plot(chosen_model, type="summary", xlim=c(0,.4))
topicQuality(model=chosen_model, documents=docs)
plot(chosen_model, type="labels", topics=c(19, 18, 12, 3))
plot(chosen_model, type="hist")
plot(chosen_model, type="perspectives", topics=c(1, 2))

# Topic Quality
topicQuality(model=chosen_model, documents=docs)

# Examples of top loading docs
thoughts <- findThoughts(chosen_model, texts=meta$clean_text, n=7, topics=32)$docs[[1]]
plotQuote(thoughts, width=90, main="Top Loading Documents for Topics")

original_labels <- c('No Label','NoNotoriety', 'Newtown', 'Police', 'Hunting', 'Gun Violence', 'MarchForOurLives', 'Walkouts', 'Government', 'NRA', 'MomsDemand', 'Gun rights', 'Federal Courts', 'Ammunition Ads', 'Background Checks', 'Mass Shootings', 'Donations', 'Twitter Activity', 'Assault Weapons', '2nd Amendment', 'Guns', 'BackfireTrump', 'Amazon', 'NRA TV', 'Domestic Violence', 'Elections', 'CCR Act', 'Arming Teachers', 'Gun Accessory Ads', 'No Label', 'Gun Law Effects')
parkland_labels <- c('Congress', 'Membership Mobilization', 'Protests', 'Background Checks', 'Twitter Accusations', 'Gun Rights', 'Elected Officials', 'Assault Weapons', 'Arming Teachers', 'Russia and NRA', 'Health (Mental and Public)', 'Twitter Conversation', 'Guns and Accessories', 'Remembrance and NoNotoriety', 'Safety, Bullying', 'Courts', 'Birthdays and Holidays', 'School Shootings', 'Trump', 'Gun Violence Trends', 'LGBTQ', 'Names', 'Gun Control', 'Concealed Carry', 'Hashtags', 'UNCLEAR', 'Domestic Violence', 'Pride Fund', 'UNCLEAR', 'Discussions', 'Leadership', 'Marchforourlives')

selected_labels <- parkland_labels

# Point Estimates USING GGPLOT
gun_position_df <- unique(select(clean_df, name, gun_control))
prep0 <- estimateEffect(1:num_topics ~ name, chosen_model, meta=out$meta, 
                       uncertainty="Global")
first_effect <- extract.estimateEffect(prep0, "name", model = chosen_model, method = "pointestimate")
effect <- left_join(first_effect, gun_position_df, by = c('covariate.value' = 'name'))

filter(effect, covariate.value == 'NRA') %>%
  ggplot(aes(y = estimate, x = topic)) + geom_point() + 
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = .1) +
  coord_flip(ylim = c(-.01, .25)) +
  geom_text(aes(y = ci.upper, label=selected_labels),hjust=-.1, vjust=.3) +
  geom_hline(yintercept=0) +
  labs(title="Topic Probability Estimates for NRA", x="Topic Number", y = "Topic Probability")

filter(effect, covariate.value == 'The Truth About Guns') %>%
  ggplot(aes(y = estimate, x = topic)) + geom_point() + 
  geom_errorbar(aes(ymin = ci.lower, ymax = ci.upper), width = .1) +
  coord_flip(ylim = c(-.01, .25)) +
  geom_text(aes(y = ci.upper, label=selected_labels),hjust=-.1, vjust=.3) +
  geom_hline(yintercept=0) +
  labs(title="Topic Probability Estimates for The Truth About Guns", x="Topic Number", y = "Topic Probability")

# Comparison Figures

filter(effect, gun_control == 0) %>%
  ggplot(aes(y = estimate, x = factor(topic), col = topic)) + geom_point() + 
  geom_errorbar(aes(ymin = ci.lower, ymax = estimate), width = .1) +
  coord_flip(ylim = c(-.00001, .5)) +
  scale_x_discrete(breaks = 1:num_topics, labels=selected_labels) +
  xlab(NULL) + 
  facet_wrap(~covariate.value) +
  geom_hline(yintercept=0) +
  scale_color_gradient(low='orange', high='blue') +
  labs(title="Topic Probability Estimates for Gun Rights Organizations", x="Topic Number", y = "Topic Probability")

filter(effect, gun_control == 0) %>%
  filter(covariate.value == 'NRA' | covariate.value == 'Ammo Land' | covariate.value== 'Gun owners of America' | covariate.value == 'Dana Loesch') %>%
  ggplot(aes(y = estimate, x = factor(topic), col = topic)) + geom_point() + 
  geom_errorbar(aes(ymin = ci.lower, ymax = estimate), width = .1) +
  coord_flip(ylim = c(-.00001, .25)) +
  scale_x_discrete(breaks = 1:num_topics, labels=selected_labels) +
  xlab(NULL) + 
  facet_wrap(~covariate.value) +
  geom_hline(yintercept=0) +
  scale_color_gradient(low='orange', high='blue') +
  labs(title="Topic Probability Estimates for Select Gun Rights Organizations", x="Topic Number", y = "Topic Probability")

filter(effect, gun_control == 1) %>%
  ggplot(aes(y = estimate, x = factor(topic), col = topic)) + geom_point() + 
  geom_errorbar(aes(ymin = ci.lower, ymax = estimate), width = .1) +
  coord_flip(ylim = c(-.00001, .5)) +
  scale_x_discrete(breaks = 1:num_topics, labels=selected_labels) +
  xlab(NULL) + 
  facet_wrap(~covariate.value) +
  geom_hline(yintercept=0) +
  scale_color_gradient(low='orange', high='blue') +
  labs(title="Topic Probability Estimates for Gun Control Organizations", x="Topic Number", y = "Topic Probability")
  
filter(effect, gun_control == 1) %>%
  filter(covariate.value == 'Brady Campaign to Prevent Gun Violence (BCPGV)' | covariate.value == 'Moms demand action for gun sense in America' | covariate.value== 'Michael Bloomberg' | covariate.value == 'Shannon Watts') %>%
  ggplot(aes(y = estimate, x = factor(topic), col = topic)) + geom_point() + 
  geom_errorbar(aes(ymin = ci.lower, ymax = estimate), width = .1) +
  coord_flip(ylim = c(-.00001, .25)) +
  scale_x_discrete(breaks = 1:num_topics, labels=selected_labels) +
  xlab(NULL) + 
  facet_wrap(~covariate.value) +
  geom_hline(yintercept=0) +
  scale_color_gradient(low='orange', high='blue') +
  labs(title="Topic Probability Estimates for Select Gun Control Organizations", x="Topic Number", y = "Topic Probability")

par(mfrow=c(1,1)) 
plot(prep0, covariate="name", 
     topics=c(2), 
     model=chosen_model, 
     method="pointestimate", 
     xlab="Predicted Probability of topic loading for group", 
     main="Topic 2: No Notoriety",
     width = 50,
     xlim=c(-.5, .5), labeltype ="custom", 
     custom.labels=unique(out$meta$name))
plot(prep0, covariate="name", 
             topics=c(7), 
             model=chosen_model, 
             method="pointestimate", 
             xlab="Predicted Probability of topic loading for group", 
     main="Topic 7: MarchforourLives",
     width = 50,
             xlim=c(-.5, .5), labeltype ="custom", 
             custom.labels=unique(out$meta$name))
plot(prep0, covariate="name", 
     topics=c(29), 
     model=chosen_model, 
     method="pointestimate", 
     xlab="Predicted Probability of topic loading for group", 
     main="Topic 29: Gun Accessories",
     width = 50,
     xlim=c(-.5, .5), labeltype ="custom", 
     custom.labels=unique(out$meta$name))
plot(prep0, covariate="name", 
     topics=c(31), 
     model=chosen_model, 
     method="pointestimate", 
     xlab="Predicted Probability of topic loading for group", 
     main="Topic 31: Gun Law Effects",
     width = 50,
     xlim=c(-.5, .5), labeltype ="custom", 
     custom.labels=unique(out$meta$name))

# Figuring out Unclear Topics
unclear_topic = 29
cloud(chosen_model, topic=unclear_topic)
thoughts <- findThoughts(chosen_model, texts=meta$clean_text, n=10, topics=unclear_topic)$docs[[1]]
plotQuote(thoughts, width=90, main="Top Loading Documents for Topics")

plot(prep0, covariate="name", 
     topics=c(unclear_topic), 
     model=chosen_model, 
     method="pointestimate", 
     xlab="Predicted Probability of topic loading for group", 
     main="Unclear Topics",
     width = 50,
     xlim=c(-.5, .5), labeltype ="custom", 
     custom.labels=unique(out$meta$name))

# Covariates - binary
prep <- estimateEffect(1:num_topics ~ gun_control, chosen_model, meta=out$meta, 
                       uncertainty="Global")

par(mfrow=c(1,1)) 
plot(prep, covariate="gun_control", 
     topics=c(1:num_topics), 
     model=chosen_model, 
     method="difference", 
     cov.value1="1", 
     cov.value2="0",
     xlab="More Gun Rights ... More Gun Control", 
     main="Effect of Gun Control vs. Gun Rights",
     xlim=c(-.25,.25), labeltype ="custom", 
     custom.labels=selected_labels)

### Covariates - categorical
prep2 <- estimateEffect(1:num_topics ~ name, chosen_model, meta=out$meta, 
                        uncertainty="Global")
#Watts vs Loesch
plot(prep2, covariate="name", 
     topics=c(1:num_topics), 
     model=chosen_model, 
     method="difference", 
     cov.value1="Shannon Watts", 
     cov.value2="Dana Loesch",
     xlab="Dana Loesch ... Shannon Watts", 
     main="Comparing Gun Groups",
     xlim=c(-.25,.25), labeltype ="custom", 
     custom.labels=selected_labels)
#Moms Demand vs NRA
plot(prep2, covariate="name", 
     topics=c(1:num_topics), 
     model=chosen_model, 
     method="difference", 
     cov.value1="Moms demand action for gun sense in America", 
     cov.value2="NRA",
     xlab="NRA ... Moms Demand", 
     main="Comparing Gun Groups",
     xlim=c(-.25,.25), labeltype ="custom", 
     custom.labels=selected_labels)
# Moms Demand vs Brady
plot(prep2, covariate="name", 
     topics=c(1:num_topics), 
     model=chosen_model, 
     method="difference", 
     cov.value1="Moms demand action for gun sense in America", 
     cov.value2="Brady Campaign to Prevent Gun Violence (BCPGV)",
     xlab="Brady Campaign ... Moms Demand", 
     main="Comparing Gun Groups",
     xlim=c(-.15,.15), labeltype ="custom", 
     custom.labels=selected_labels)

# NRA vs Ammo Land
plot(prep2, covariate="name", 
     topics=c(1:num_topics), 
     model=chosen_model, 
     method="difference", 
     cov.value1="NRA", 
     cov.value2="Ammo Land",
     xlab="Ammo Land ... NRA", 
     main="Comparing Gun Groups",
     xlim=c(-.2,.2), labeltype ="custom", 
     custom.labels=selected_labels)

### Covariates - Continuous
out$meta$numeric_date <- as.numeric(out$meta$simple_date)
prep3 <- estimateEffect(1:num_topics ~ s(numeric_date)*name, chosen_model, meta=out$meta, 
                       uncertainty="Global")
prep4 <- estimateEffect(1:num_topics ~ s(numeric_date)*gun_control, chosen_model, meta=out$meta, 
                        uncertainty="Global")

# Creating TidySTM run for graphing with GGPlot
time_effect <- lapply(c('Brady Campaign to Prevent Gun Violence (BCPGV)', "Moms demand action for gun sense in America"), function(i) {extract.estimateEffect(x = prep3,
                         covariate = "numeric_date",
                         method = "continuous",
                         model = chosen_model,
                         labeltype = "custom",
                         custom.labels = selected_labels,
                         n = 6,
                         moderator = "name",
                         moderator.value = i)
})
time_effect <- do.call("rbind", time_effect)
time_effect$month <- as.Date(time_effect$covariate.value, origin = "1970-01-01")

# Plot with GG Plot
ggplot(time_effect, aes(x = month, y = estimate,
                   ymin = ci.lower, ymax = ci.upper,
                   group = moderator.value,
                   fill = factor(moderator.value),
                   col = factor(moderator.value))) +
  facet_wrap(~ label, ncol = 4) +
  geom_ribbon(alpha = .5) +
  geom_line() +
  scale_x_continuous(labels = function(x) ifelse(x == 1, "1\nREP", ifelse(x == 0, "0\nDEM", x))) +
  labs(y = "Expected Topic Proportion",
       fill = "Organization",
       title = "Topic Probabilities Over Time") +
  guides(col=FALSE) + 
  theme(legend.position = "bottom") +
  ylim(-.1, .6) + 
  scale_x_date(name = 'Date', breaks = waiver(), date_breaks = '2 weeks',
               labels = waiver(), date_labels = '%m/%d', minor_breaks = waiver(),
               date_minor_breaks = waiver(), limits = NULL, expand = waiver(),
               position = "bottom") +
  geom_vline(xintercept=as.Date('2018-02-14'), linetype="dotted") +
  geom_hline(yintercept=0, linetype="dotted") +
  scale_color_brewer(palette = "Dark2") + 
  scale_fill_brewer(palette = "Dark2")

# Plot single org, all topics
rights_list <- clean_df %>%
  filter(gun_control == 0) %>%
  select(name) %>%
  unique()
rights_list_test <- rights_list$name

control_list <- clean_df %>%
  filter(gun_control == 1) %>%
  select(name) %>%
  unique()
control_list_test <- control_list$name

full_list <- clean_df %>%
  select(name) %>%
  unique()
full_list_test <- full_list$name

time_effect <- lapply(full_list_test, function(i) {extract.estimateEffect(x = prep3,
                                                                          covariate = "numeric_date",
                                                                          method = "continuous",
                                                                          model = chosen_model,
                                                                          labeltype = "custom",
                                                                          custom.labels = selected_labels,
                                                                          n = 6,
                                                                          moderator = "name",
                                                                          moderator.value = i)
})
time_effect <- do.call("rbind", time_effect)
time_effect$month <- as.Date(time_effect$covariate.value, origin = "1970-01-01")

# Here's the graphs with the LApply Function
lapply(full_list_test, function(i) {filter(time_effect, moderator.value == i) %>%
    ggplot(aes(x = month, y = estimate,
                           ymin = ci.lower, ymax = ci.upper,
                           group = moderator.value,
                           fill = factor(moderator.value),
                           col = factor(moderator.value))) +
    facet_wrap(~ label, ncol = 5) +
    geom_ribbon(alpha = .5) +
    geom_line() +
    scale_x_continuous(labels = function(x) ifelse(x == 1, "1\nREP", ifelse(x == 0, "0\nDEM", x))) +
    labs(y = "Expected Topic Proportion",
         fill = "Organization",
         title = paste(i, ": Topic Probabilities Over Time")) +
    guides(col=FALSE) + 
    theme(legend.position = "bottom") +
    ylim(-.1, .75) + 
    scale_x_date(name = 'Date', breaks = waiver(), date_breaks = '2 weeks',
                 labels = waiver(), date_labels = '%m/%d', minor_breaks = waiver(),
                 date_minor_breaks = waiver(), limits = NULL, expand = waiver(),
                 position = "bottom") +
    geom_vline(xintercept=as.Date('2018-02-14'), linetype="dotted") +
    geom_hline(yintercept=0, linetype="dotted") +
    scale_color_brewer(palette = "Dark2") + 
    scale_fill_brewer(palette = "Dark2")})

# And for each topic
lapply(selected_labels, function(i) {filter(time_effect, label == i) %>%
    ggplot(aes(x = month, y = estimate,
               #ymin = ci.lower, ymax = ci.upper,
               group = label,
               fill = factor(label),
               col = factor(label))) +
    facet_wrap(~ moderator.value, ncol = 8) +
    #geom_ribbon(alpha = .5) +
    geom_line() +
    scale_x_continuous(labels = function(x) ifelse(x == 1, "1\nREP", ifelse(x == 0, "0\nDEM", x))) +
    labs(y = "Expected Topic Proportion",
         fill = "Organization",
         title = paste(i, ": Topic Probabilities Over Time")) +
    guides(col=FALSE) + 
    theme(legend.position = "bottom") +
    ylim(-.1, 1) + 
    scale_x_date(name = 'Date', breaks = waiver(), date_breaks = '2 weeks',
                 labels = waiver(), date_labels = '%m/%d', minor_breaks = waiver(),
                 date_minor_breaks = waiver(), limits = NULL, expand = waiver(),
                 position = "bottom") +
    geom_vline(xintercept=as.Date('2018-02-14'), linetype="dotted") +
    geom_hline(yintercept=0, linetype="dotted") +
    scale_color_brewer(palette = "Dark2") + 
    scale_fill_brewer(palette = "Dark2")})


# Plot with GG Plot
ggplot(time_effect, aes(x = month, y = estimate,
                        ymin = ci.lower, ymax = ci.upper,
                        group = moderator.value,
                        fill = factor(moderator.value),
                        col = factor(moderator.value))) +
  facet_wrap(~ label, ncol = 8) +
  #geom_ribbon(alpha = .2) +
  geom_line() +
  scale_x_continuous(labels = function(x) ifelse(x == 1, "1\nREP", ifelse(x == 0, "0\nDEM", x))) +
  labs(y = "Expected Topic Proportion",
       col = "Organization",
       title = "Topic Probabilities Over Time") +
  guides(fill=FALSE) + 
  theme(legend.position = "bottom") +
  ylim(-.1, 1) + 
  scale_x_date(name = 'Date', breaks = waiver(), date_breaks = '2 weeks',
               labels = waiver(), date_labels = '%m/%d', minor_breaks = waiver(),
               date_minor_breaks = waiver(), limits = NULL, expand = waiver(),
               position = "bottom") +
  geom_vline(xintercept=as.Date('2018-02-14'), linetype="dotted") +
  geom_hline(yintercept=0, linetype="dotted") +
  scale_color_brewer(palette = "Paired") + 
  scale_fill_brewer(palette = "Paired")

#### Covariates - Network
mod.out.corr <- topicCorr(chosen_model)
plot(mod.out.corr)

#### Word Cloud
cloud(chosen_model, topic=2)

#### Word Frequency by covariate
plot(chosen_model, type="perspectives", topics=7)

#### Convergence
plot(chosen_model$convergence$bound, type="l", ylab="Approximate Objective", 
     main="Convergence")

#### Interactive
stmCorrViz(chosen_model, "stm-interactive-correlation.html", 
           documents_raw=meta$clean_text, documents_matrix=out$documents)
